Set working directory and remove exisiting data

knitr::opts_chunk$set(echo = TRUE)

# Set working directory
setwd("C:\\Users\\ranch\\OneDrive\\cabi-trip-history-data")

# Remove exisiting data
rm(list=ls(all=TRUE))

Load libraries

# Load libraries
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(scales)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
## 
##     here
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(ggmap)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Show the raw data

# Public data for Capital Bike Share program in Washington DC 
load("FinalDataNew.rdata")
head(FinalDataNew)
##   TotalDurationMs           StartDate             EndDate
## 1        51962000 2010-12-31 23:49:00 2011-01-01 14:15:00
## 2          514000 2010-12-31 23:37:00 2010-12-31 23:46:00
## 3          737000 2010-12-31 23:27:00 2010-12-31 23:39:00
## 4          953000 2010-12-31 23:21:00 2010-12-31 23:37:00
## 5         2179000 2010-12-31 23:20:00 2010-12-31 23:56:00
## 6          580000 2010-12-31 23:18:00 2010-12-31 23:28:00
##   StartStationNumber                StartStation EndStationNumber
## 1              31111             10th & U St NW             31111
## 2              31111             10th & U St NW             31202
## 3              31602    Park Rd & Holmead Pl NW             31401
## 4              31106 Calvert St & Woodley Pl NW             31401
## 5              31110   20th St & Florida Ave NW             31623
## 6              31107    Lamont & Mt Pleasant NW             31111
##                         EndStation BikeNumber AccountType
## 1                  10th & U St NW      W00771      Casual
## 2                  14th & R St NW      W01119  Registered
## 3          14th St & Spring Rd NW      W00973  Registered
## 4          14th St & Spring Rd NW      W00914  Registered
## 5 Columbus Circle / Union Station      W00859      Casual
## 6                  10th & U St NW      W01119  Registered

Show the dimension of the raw data

dim(FinalDataNew) # the length of the data is 13,623,103
## [1] 13623103        9

Summarize DC bike share data at different time for data visulization

# Load the starting date and time of all bike rentals
load("SummaryDateAndTime.rdata")
head(SummaryDateAndTime)
##   time_num  time hour day weekday month year
## 1     1429 23:49   23  31  Friday    12 2010
## 2     1417 23:37   23  31  Friday    12 2010
## 3     1407 23:27   23  31  Friday    12 2010
## 4     1401 23:21   23  31  Friday    12 2010
## 5     1400 23:20   23  31  Friday    12 2010
## 6     1398 23:18   23  31  Friday    12 2010
# Count rental event by year
year_summary <- ddply(SummaryDateAndTime,.(year),summarise,N = length(year))

# Count rental event by month and hour
month_summary <- ddply(SummaryDateAndTime,.(month,hour),summarise, N = length(hour))

# Count rental event by weeday and hour
weekday_summary <- ddply(SummaryDateAndTime,.(weekday,hour),summarise, N = length(hour))

Visualize DC bike share data at different year (2010 only has Q4 data, and 2016 only has Q1, Q2 data)

ggplot(data=year_summary, aes(x=year, y=N)) +
  geom_bar(stat="identity")+
  # Add text at top of bars
  geom_text(data=year_summary,aes(x=year,y=N,label=N),vjust=0)

Visualize DC bike share data at different month

ggplot(SummaryDateAndTime, aes(x = hour, y = N, colour = month)) +
  geom_point(data = month_summary, aes(group = month)) +
  geom_line(data = month_summary, aes(group = month)) +
  scale_x_discrete("hour") +
  scale_y_continuous("N") +
  theme_minimal() +
  ggtitle("People rent more bikes from April to June.") + 
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(plot.title=element_text(size=18))

Visualize the different rental demands at weekday and weekend

ggplot(SummaryDateAndTime, aes(x = hour, y = N, colour = weekday)) +
  geom_point(data = weekday_summary, aes(group = weekday)) +
  geom_line(data = weekday_summary, aes(group = weekday)) +
  scale_x_discrete("hour") +
  scale_y_continuous("N") +
  theme_minimal() +
  ggtitle("Different demands in weekday and weekend.\n") + 
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(plot.title=element_text(size=18))

Create a dataframe with start station number and day

# Create a dataframe with StartStationNumber and Weekday
StartStationinWeek.df <- data.frame(FinalDataNew$StartStationNumber, SummaryDateAndTime$weekday)
colnames(StartStationinWeek.df) <- c('StartStationNumber', 'SummaryWeekday')
head(StartStationinWeek.df)
##   StartStationNumber SummaryWeekday
## 1              31111         Friday
## 2              31111         Friday
## 3              31602         Friday
## 4              31106         Friday
## 5              31110         Friday
## 6              31107         Friday

Group the days by weekday and weekend

weekends <- c('Saturday', 'Sunday') #define Sat and Sun as weekend
# If the day is Saturday or Sunday, labels the day as "weekend", other wise labels it as "weekday"
StartStationinWeek.df$SummaryWeekday <- factor((StartStationinWeek.df$SummaryWeekday %in% weekends), 
       levels=c(TRUE, FALSE), labels=c('weekend', 'weekday')) 
colnames(StartStationinWeek.df) <- c('StartStationNumber', 'WeekdayOrWeekend')
head(StartStationinWeek.df)
##   StartStationNumber WeekdayOrWeekend
## 1              31111          weekday
## 2              31111          weekday
## 3              31602          weekday
## 4              31106          weekday
## 5              31110          weekday
## 6              31107          weekday

Count the rental event by station and weekday/weekend

library(plyr)
SummaryStartStationinWeek.df <- count(StartStationinWeek.df, c('StartStationNumber', 'WeekdayOrWeekend')) 
head(SummaryStartStationinWeek.df)
##   StartStationNumber WeekdayOrWeekend  freq
## 1              31000          weekend  2149
## 2              31000          weekday  5928
## 3              31001          weekend  6289
## 4              31001          weekday 14583
## 5              31002          weekend  7229
## 6              31002          weekday 17676

Find the top rental stations in weekday, weekend, and all week

SummaryStationWeekday.df <- subset(SummaryStartStationinWeek.df[c(1,3)], 
                                  SummaryStartStationinWeek.df$WeekdayOrWeekend == 'weekday')
SummaryStationWeekend.df <- subset(SummaryStartStationinWeek.df[c(1,3)], 
                                  SummaryStartStationinWeek.df$WeekdayOrWeekend == 'weekend')

# Scale the freq in weekday and weekend
SummaryStationWeekday.df$freq <- SummaryStationWeekday.df$freq/5
SummaryStationWeekend.df$freq <- SummaryStationWeekend.df$freq/2

# Sort the Station by freq (from high to low)
SummaryStationWeekday.df <- SummaryStationWeekday.df[order(-SummaryStationWeekday.df$freq),]
SummaryStationWeekend.df <- SummaryStationWeekend.df[order(-SummaryStationWeekend.df$freq),]

# Find the top 5 rental station in weekday
Top5StationInWeekday.df <- SummaryStationWeekday.df[1:5,]
TopStationInWeekday.df <- cbind(Top5StationInWeekday.df[1], 'Weekday')
colnames(TopStationInWeekday.df)[2] <- 'Week'

# Find the top 5 rental station in weekend
Top5StationInWeekend.df <- SummaryStationWeekend.df[1:5,]
TopStationInWeekend.df <- cbind(Top5StationInWeekend.df[1], 'Weekend')
colnames(TopStationInWeekend.df)[2] <- 'Week'

# Merge the top 5 rental staions in weekday and weekend
TopStationInWeek.df <- merge(TopStationInWeekday.df,TopStationInWeekend.df, by = 'StartStationNumber', all = TRUE)
TopStationInWeek.df$Week.x <- as.character(TopStationInWeek.df$Week.x)

# If the station in included in both weekday and weekend stations label it as top stations all week
TopStationInWeek.df$Week.x[!is.na(TopStationInWeek.df$Week.x) & !is.na(TopStationInWeek.df$Week.y)] <- 'AllWeek'
TopStationInWeek.df$Week.x[is.na(TopStationInWeek.df$Week.x)] <- 'Weekend'
TopStationInWeek.df <- TopStationInWeek.df[1:2]
colnames(TopStationInWeek.df)[2] <- 'TopRentalStations'
TopStationInWeek.df
##   StartStationNumber TopRentalStations
## 1              31101           Weekend
## 2              31200           AllWeek
## 3              31201           AllWeek
## 4              31229           Weekday
## 5              31623           Weekday
## 6              31241           Weekday
## 7              31247           Weekend
## 8              31258           Weekend

Extract the GPS info of stations from the online XML file

Find the overall top 10 stations

# Count rental demand in each hour by start station
HourlyDemandPrediction.df <- count(DemandPredictionDataSet,c(1,3,4,5,7))

# Count rental demand in each hour by end station
HourlyDemandPrediction.end.df <- count(DemandPredictionDataSet,c(9,10,11,13,14))
TopStations <- count(DemandPredictionDataSet,c('StartStationNumber'))
TopStations$StartStationNumber <- as.numeric(as.character(TopStations$StartStationNumber))
# Sort the stations by frequency
TopStations <- TopStations[order(-TopStations$freq), ]
# Get top 10 stations 
Top10Stations <- head(TopStations$StartStationNumber,10)
Top10Stations
##  [1] 31200 31623 31201 31258 31247 31229 31214 31101 31241 31613

Visualize rental demand in Station X by hour and weekday

# Top X station
X <- 1
StationX <- Top10Stations[X]

# Create a dataset for the rental demand in Station X by hour and weekday

# Get the data only for Station X
StationX.df <- subset(DemandPredictionDataSet, DemandPredictionDataSet$StartStationNumber == StationX)

# Count the rental demand in Station X by hour
StationX.summary <- ddply(StationX.df,.(month,weekday,hour),summarise,N = length(hour))
StationX.summary.detail <- ddply(StationX.df,.(year,month,day,weekday,hour),summarise,N = length(hour))

# Count the returned bike in Station X by hour
EndStationX.df <- subset(DemandPredictionDataSet, DemandPredictionDataSet$EndStationNumber == StationX)
EndStationX.summary.detail <- ddply(EndStationX.df,.(end_year,end_month,end_day,end_weekday,end_hour),
                         summarise,N = length(end_hour))
colnames(EndStationX.summary.detail)[1:5] <- c('year','month','day','weekday','hour')

# Count the rental demand in Station X by hour and weekday
StationX.DayAndHour <- aggregate(N ~ weekday + hour, data = StationX.summary, FUN = sum)

# Visualize the rental demand in Station X by hour and weekday
ggplot(StationX.DayAndHour, aes(x = as.factor(hour), y = N, colour = weekday)) +
  geom_point(data = StationX.DayAndHour, aes(group = weekday)) +
  geom_line(data = StationX.DayAndHour, aes(group = weekday)) +
  scale_x_discrete("hour") +
  scale_y_continuous("N") +
  theme_minimal() +
  ggtitle(sprintf("Rental Demand in Station %d", Top10Stations[X])) + 
  theme_minimal() +
  theme(plot.title=element_text(size=18))+
  theme(plot.title = element_text(hjust = 0.5))

Visualize rental demand in Station X by hour and month in 3D

library(plotly)
# Get the rental demand of station X on Mondays at each hour and each month 
Demand <- xtabs(N ~ hour + month, data = subset(StationX.summary, StationX.summary$weekday == 'Monday'))
plot_ly(x = 1:12, z = Demand, type = "surface") %>%
     layout(scene = list(title = "Rental Demand on Saturdays in Station 31623, x = month, y= hour, z1 = count", xaxis = list(title = "Month"), yaxis = list(title = "Hour"), zaxis = list(title = "Demand")))

Prepare functions for create the predictive model

Polynomial model with the input of station number, year, month, weekday, and hour

# Stations that use 12th polynomial model
Model1Station <- c(31623, 31201, 31258, 31247, 31229, 31214, 31101, 31613)

# Stations that use 10th polynomial model
Model2Station <- c(31200, 31241)

# 12th polynomial model for weekday 
weekdayPolyModel1 <- function(input_dataset){
  dataset.out.weekday <- input_dataset
  dataset.out.weekday$year_p2 <- dataset.out.weekday$year^2
  dataset.out.weekday$month_p2 <- dataset.out.weekday$month^2
  dataset.out.weekday$hour_p2 <- dataset.out.weekday$hour^2
  dataset.out.weekday$hour_p3 <- dataset.out.weekday$hour^3
  dataset.out.weekday$hour_p4 <- dataset.out.weekday$hour^4
  dataset.out.weekday$hour_p5 <- dataset.out.weekday$hour^5
  dataset.out.weekday$hour_p6 <- dataset.out.weekday$hour^6
  dataset.out.weekday$hour_p7 <- dataset.out.weekday$hour^7
  dataset.out.weekday$hour_p8 <- dataset.out.weekday$hour^8
  dataset.out.weekday$hour_p9 <- dataset.out.weekday$hour^9
  dataset.out.weekday$hour_p10 <- dataset.out.weekday$hour^10
  dataset.out.weekday$hour_p11 <- dataset.out.weekday$hour^11
  dataset.out.weekday$hour_p12 <- dataset.out.weekday$hour^12
  # Fitting the polynomial regression
  regressor.weekday <- lm(formula = out ~ year+year_p2+month+month_p2+hour+hour_p2+hour_p3+hour_p4
                      +hour_p5+hour_p6+hour_p7+hour_p8+hour_p9+hour_p10+hour_p11+hour_p12,
                       data = dataset.out.weekday)
  # Function returns the regressor for weekday
  return(regressor.weekday)
}

# 10th polynomial model for weekday 
weekdayPolyModel2 <- function(input_dataset){
  dataset.out.weekday <- input_dataset
  dataset.out.weekday$year_p2 <- dataset.out.weekday$year^2
  dataset.out.weekday$month_p2 <- dataset.out.weekday$month^2
  dataset.out.weekday$hour_p2 <- dataset.out.weekday$hour^2
  dataset.out.weekday$hour_p3 <- dataset.out.weekday$hour^3
  dataset.out.weekday$hour_p4 <- dataset.out.weekday$hour^4
  dataset.out.weekday$hour_p5 <- dataset.out.weekday$hour^5
  dataset.out.weekday$hour_p6 <- dataset.out.weekday$hour^6
  dataset.out.weekday$hour_p7 <- dataset.out.weekday$hour^7
  dataset.out.weekday$hour_p8 <- dataset.out.weekday$hour^8
  dataset.out.weekday$hour_p9 <- dataset.out.weekday$hour^9
  dataset.out.weekday$hour_p10 <- dataset.out.weekday$hour^10
  # Fitting the polynomial regression
  regressor.weekday <- lm(formula = out ~ year+year_p2+month+month_p2+hour+hour_p2+hour_p3+hour_p4+
                            hour_p5+hour_p6+hour_p7+hour_p8+hour_p9+hour_p10,
                       data = dataset.out.weekday)
  return(regressor.weekday)
}

# 4th polynomial model for weekend
weekendPolyModel <- function(input_dataset){
  dataset.out.weekday <- input_dataset
  dataset.out.weekend$year_p2 <- dataset.out.weekend$year^2
  dataset.out.weekend$month_p2 <- dataset.out.weekend$month^2
  dataset.out.weekend$hour_p2 <- dataset.out.weekend$hour^2
  dataset.out.weekend$hour_p3 <- dataset.out.weekend$hour^3
  dataset.out.weekend$hour_p4 <- dataset.out.weekend$hour^4
  # Fitting the polynomial regression
  regressor.weekend <- lm(formula = out ~ year+year_p2+month+month_p2+
                                    hour+hour_p2+hour_p3+hour_p4,
                       data = dataset.out.weekend)
  # Function returns the regressor for weekday
  return(regressor.weekend)
}

# Function to select the model for weekday prediction
predictStationXinWeekday <-  function(SelectStation){
  # If the station number is in Model1 then use Model1 (12th poly)
  if (SelectStation %in% Model1Station){
    RegressorWeekday = weekdayPolyModel1(dataset.out.weekday)
  }
  # If the station number is in Model2 then use Model2 (10th poly)
  if (SelectStation %in% Model2Station){
    RegressorWeekday = weekdayPolyModel2(dataset.out.weekday)
  } 
  return(RegressorWeekday)
}

# Function to use model (4th poly) for weekend prediction
predictStationXinWeekend <-  function(SelectStation){
  RegressorWeekend = weekendPolyModel(dataset.out.weekend)
  return(RegressorWeekend)
}

# Function to create the predicted dataset in weekday
predictedWeekdayDemand <- function(SelectStation,Year,Month,Hour){
  # Select the model 1 (12th poly)
  if (SelectStation %in% Model1Station){
    modeldata <-  data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
                            hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4,
                            hour_p5 = Hour^5, hour_p6 = Hour^6, hour_p7 = Hour^7, hour_p8 = Hour^8,
                            hour_p9 = Hour^9, hour_p10 = Hour^10, hour_p11 = Hour^11, hour_p12 = Hour^12)  
  }
  # Select the model 2 (10th poly)
  if (SelectStation %in% Model2Station){
    modeldata <-  data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
                            hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4,
                            hour_p5 = Hour^5, hour_p6 = Hour^6, hour_p7 = Hour^7, hour_p8 = Hour^8,
                            hour_p9 = Hour^9, hour_p10 = Hour^10) 
  }
  # Create the predicted dataset in weekday
  predictedWeekdayData = cbind.data.frame(hour = Hour, 
                                          demand = predict(myPolyModelWeekday,newdata = modeldata))
  # Return the predicted dataset in weekday
  return(predictedWeekdayData)
}

# Function to create the predicted dataset in weekend
predictedWeekendDemand <- function(SelectStation,Year,Month,Hour){
  # Use 4th poly model
  modeldata <-  data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
                            hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4)  
  # Create the predicted dataset in weekend
  predictedWeekendData = cbind.data.frame(hour = Hour, 
                                             demand = predict(myPolyModelWeekend,newdata = modeldata))
  # Return the predicted dataset in weekend
  return(predictedWeekendData)
}

Predict the rental demand in Station X

# Select station X
X <- 9
StationX <- Top10Stations[X]

# Create a dataset for the rental demand in Station X by hour and weekday

# Count the rental demand in Station X by hour
StationX.df <- subset(DemandPredictionDataSet, DemandPredictionDataSet$StartStationNumber == StationX)

# Count the rental demand in Station X by hour
StationX.summary <- ddply(StationX.df,.(month,weekday,hour),summarise,N = length(hour))
StationX.summary.detail <- ddply(StationX.df,.(year,month,day,weekday,hour),summarise,N = length(hour))

# Count the returned bike in Station X by hour
EndStationX.df <- subset(DemandPredictionDataSet, DemandPredictionDataSet$EndStationNumber == StationX)
EndStationX.summary.detail <- ddply(EndStationX.df,.(end_year,end_month,end_day,end_weekday,end_hour),
                         summarise,N = length(end_hour))
colnames(EndStationX.summary.detail)[1:5] <- c('year','month','day','weekday','hour')
StationX.demand.detail <- merge.data.frame(StationX.summary.detail, EndStationX.summary.detail,
                                   by = c('year','month','day','weekday','hour'),
                                   all = TRUE)
StationX.demand.detail[is.na(StationX.demand.detail)] <- 0
colnames(StationX.demand.detail)[6:7] <- c('out','In')
StationX.demand.detail$netout <- StationX.demand.detail$out - StationX.demand.detail$In
dataset.out <- StationX.demand.detail[c(1:5,6)]  

# Create the dataset for weekday and weekend
weekend.position <- dataset.out$weekday == 'Saturday' | dataset.out$weekday == 'Sunday'
dataset.out.weekday <- subset(dataset.out, weekend.position == FALSE)
dataset.out.weekend <- subset(dataset.out, weekend.position == TRUE)
PredictYear <- 2015
PredictMonth <- 6
hour_grid <- seq(0, 23, 0.1)

# Predict weekday rental demand in Station X
myPolyModelWeekday <- predictStationXinWeekday(StationX)
summary(myPolyModelWeekday)
## 
## Call:
## lm(formula = out ~ year + year_p2 + month + month_p2 + hour + 
##     hour_p2 + hour_p3 + hour_p4 + hour_p5 + hour_p6 + hour_p7 + 
##     hour_p8 + hour_p9 + hour_p10, data = dataset.out.weekday)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.4898  -2.5461  -0.3761   1.9984  28.9818 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.134e+06  6.294e+04  -18.01   <2e-16 ***
## year         1.126e+03  6.251e+01   18.01   <2e-16 ***
## year_p2     -2.794e-01  1.552e-02  -18.00   <2e-16 ***
## month        1.704e+00  3.494e-02   48.77   <2e-16 ***
## month_p2    -1.195e-01  2.608e-03  -45.83   <2e-16 ***
## hour        -7.479e+00  7.435e-01  -10.06   <2e-16 ***
## hour_p2      1.317e+01  9.194e-01   14.32   <2e-16 ***
## hour_p3     -8.281e+00  4.476e-01  -18.50   <2e-16 ***
## hour_p4      2.462e+00  1.139e-01   21.61   <2e-16 ***
## hour_p5     -3.984e-01  1.710e-02  -23.30   <2e-16 ***
## hour_p6      3.794e-02  1.595e-03   23.79   <2e-16 ***
## hour_p7     -2.191e-03  9.348e-05  -23.43   <2e-16 ***
## hour_p8      7.552e-05  3.351e-06   22.54   <2e-16 ***
## hour_p9     -1.432e-06  6.708e-08  -21.34   <2e-16 ***
## hour_p10     1.149e-08  5.745e-10   20.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.262 on 23974 degrees of freedom
## Multiple R-squared:  0.4867, Adjusted R-squared:  0.4864 
## F-statistic:  1624 on 14 and 23974 DF,  p-value: < 2.2e-16
# Get the predicted demand data
predictedWeekdayData = predictedWeekdayDemand(StationX,PredictYear,PredictMonth,hour_grid)

# Get the actual deamnd data in the month and year
PointToPlot1 <- dataset.out.weekday$year == PredictYear & dataset.out.weekday$month == PredictMonth
plotData <-  dataset.out.weekday[PointToPlot1, c(5,6)]

# Plot the boxplot of the actual data and lineplot of the predicted data
plotWeekday <- ggplot()+
  geom_boxplot(data = plotData,aes(x=hour,y =out, group = hour))+
  geom_line(data = predictedWeekdayData, aes(x=hour, y=demand), colour = 'blue',group = 1) +
  ggtitle(paste("Weekday Rental Demand at Station ",StationX, " in ",PredictYear, "-", PredictMonth, sep = "")) +
  xlab('Hour') +
  ylab('Demand')+
  theme(plot.title = element_text(hjust = 0.5))
plotWeekday

#Predict weekend rental demand in Station X
myPolyModelWeekend <- predictStationXinWeekend(StationX)
summary(myPolyModelWeekend)
## 
## Call:
## lm(formula = out ~ year + year_p2 + month + month_p2 + hour + 
##     hour_p2 + hour_p3 + hour_p4, data = dataset.out.weekend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7333 -1.8686 -0.4087  1.3897 17.5835 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.406e+05  6.797e+04  -13.84   <2e-16 ***
## year         9.342e+02  6.750e+01   13.84   <2e-16 ***
## year_p2     -2.320e-01  1.676e-02  -13.84   <2e-16 ***
## month        1.263e+00  3.809e-02   33.15   <2e-16 ***
## month_p2    -9.131e-02  2.838e-03  -32.18   <2e-16 ***
## hour        -1.299e+00  8.157e-02  -15.93   <2e-16 ***
## hour_p2      3.418e-01  1.478e-02   23.13   <2e-16 ***
## hour_p3     -2.210e-02  9.520e-04  -23.22   <2e-16 ***
## hour_p4      4.223e-04  2.010e-05   21.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.876 on 9526 degrees of freedom
## Multiple R-squared:  0.3481, Adjusted R-squared:  0.3475 
## F-statistic: 635.7 on 8 and 9526 DF,  p-value: < 2.2e-16
# Get the predicted demand data
predictedWeekendData = predictedWeekendDemand(StationX,PredictYear,PredictMonth,hour_grid)

# Get the actual deamnd data in the month and year
PointToPlot1 <- dataset.out.weekend$year == PredictYear & dataset.out.weekend$month == PredictMonth
plotData <-  dataset.out.weekend[PointToPlot1, c(5,6)]

# Plot the boxplot of the actual data and lineplot of the predicted data
plotWeekend <- ggplot()+
  geom_boxplot(data = plotData,aes(x=hour,y =out, group = hour))+
  geom_line(data = predictedWeekendData, aes(x=hour, y=demand), colour = 'blue',group = 1) +
  ggtitle(paste("Weekend Rental Demand at Station ",StationX, " in ",PredictYear, "-", PredictMonth, sep = "")) +
  xlab('Hour') +
  ylab('Demand')+
  theme(plot.title = element_text(hjust = 0.5))
plotWeekend